Attempting to do some problem solving.
Read in the gapminder_clean.csv data as a tibble using read_csv.
Filter the data to include only rows where Year is 1962 and then make a scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap for the filtered data.
# Load the Gapminder extract. The later code refers to the columns by the
# dotted names that read.csv() produces (e.g. CO2.emissions..metric.tons.per.capita.).
# NOTE(review): the path is absolute and machine-specific.
data <- read.csv("C:/Users/User/Desktop/Github/RforDataScience/gapminder_clean.csv")

# Keep only the 1962 rows where both GDP per capita and CO2 emissions are
# present, then convert to a tibble for compact printing.
y1962 <- as_tibble(
  filter(
    data,
    Year == 1962,
    !is.na(gdpPercap),
    !is.na(CO2.emissions..metric.tons.per.capita.)
  )
)
head(y1962)
## # A tibble: 6 × 20
## X Country.Name Year Agriculture..value.added....…¹ CO2.emissions..metri…²
## <int> <chr> <int> <dbl> <dbl>
## 1 0 Afghanistan 1962 NA 0.0738
## 2 10 Albania 1962 NA 1.44
## 3 20 Algeria 1962 NA 0.485
## 4 50 Angola 1962 NA 0.216
## 5 80 Argentina 1962 NA 2.52
## 6 110 Australia 1962 NA 8.84
## # ℹ abbreviated names: ¹Agriculture..value.added....of.GDP.,
## # ²CO2.emissions..metric.tons.per.capita.
## # ℹ 15 more variables:
## # Domestic.credit.provided.by.financial.sector....of.GDP. <dbl>,
## # Electric.power.consumption..kWh.per.capita. <dbl>,
## # Energy.use..kg.of.oil.equivalent.per.capita. <dbl>,
## # Exports.of.goods.and.services....of.GDP. <dbl>, …
# Scatter plot of GDP per capita against CO2 emissions for 1962,
# shown as an interactive plotly widget.
plot1962 <- ggplot(
  data = y1962,
  mapping = aes(x = CO2.emissions..metric.tons.per.capita., y = gdpPercap)
) +
  geom_point()
ggplotly(plot1962)
# Extract both columns as plain vectors. The variable names are kept as they
# are because cor.test() echoes them in the "data:" line of its printout.
gdpvar <- y1962[["gdpPercap"]]
co2var <- y1962[["CO2.emissions..metric.tons.per.capita."]]
cor.test(co2var, gdpvar, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: co2var and gdpvar
## t = 25.269, df = 106, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8934697 0.9489792
## sample estimates:
## cor
## 0.9260817
For calculating the correlation between two continuous variables, Pearson’s product-moment correlation seemed appropriate.
# Pearson correlation between CO2 emissions and GDP per capita, per year.
#
# `years` lists every Year that has at least one non-missing gdpPercap value
# (the `avg` column is kept so the object has the same shape as before).
years <- data %>%
  filter(!is.na(gdpPercap)) %>%
  group_by(Year) %>%
  summarise(avg = mean(gdpPercap))

# One result row per year. cor.test() is run once per year and its results are
# read by name rather than by list position; the rows are combined in a single
# bind_rows() call instead of growing a data frame inside a loop.
cordata <- lapply(years[["Year"]], function(cyear) {
  moddata <- filter(data, Year == cyear)
  test <- cor.test(
    moddata[["CO2.emissions..metric.tons.per.capita."]],
    moddata[["gdpPercap"]],
    method = "pearson"
  )
  data.frame(
    year = cyear,
    # unname() drops the "cor" label so it does not leak into the row names.
    correlation = unname(test$estimate),
    p.value = test$p.value
  )
}) %>%
  bind_rows() %>%
  arrange(desc(correlation))
cordata
## year correlation p.value
## 1 1967 0.9387918 3.397143e-53
## 2 1962 0.9260817 1.128679e-46
## 3 1972 0.8428986 1.824292e-32
## 4 1982 0.8166384 5.565916e-29
## 5 1987 0.8095531 3.899627e-28
## 6 1992 0.8094316 1.610614e-29
## 7 1997 0.8081396 7.976156e-30
## 8 2002 0.8006421 3.863564e-29
## 9 1977 0.7928336 2.838892e-26
## 10 2007 0.7204169 9.232747e-22
# 1967 is the year with the strongest correlation in the table above.
# Plot it with point size mapped to population and colour to continent.
y1967 <- filter(data, Year == 1967, !is.na(gdpPercap))
plot67 <- ggplot(
  y1967,
  aes(
    x = CO2.emissions..metric.tons.per.capita.,
    y = gdpPercap,
    size = pop,
    color = continent
  )
) +
  geom_point() +
  labs(
    title = "Correlation between CO2 emission and GDP per capita",
    x = "CO2 emissions (metric tons per capita)",
    y = "GDP per capita",
    size = ""
  )
ggplotly(plot67)
# Electric power consumption with its continent label, restricted to rows
# where consumption is present and the continent is not blank.
cont_energy <- data %>%
  filter(
    !is.na(Electric.power.consumption..kWh.per.capita.),
    continent != ""
  ) %>%
  dplyr::select(Electric.power.consumption..kWh.per.capita., continent)

# Plain vectors, reused by the Dunn test further down.
elec <- cont_energy[["Electric.power.consumption..kWh.per.capita."]]
cont <- cont_energy[["continent"]]

# Per-continent samples.
africa_energy <- elec[cont == "Africa"]
america_energy <- elec[cont == "Americas"]
asia_energy <- elec[cont == "Asia"]
europe_energy <- elec[cont == "Europe"]
oceania_energy <- elec[cont == "Oceania"]

# Normality check on the Africa sample.
shapiro.test(africa_energy)
##
## Shapiro-Wilk normality test
##
## data: africa_energy
## W = 0.59218, p-value < 2.2e-16
# Kruskal-Wallis rank sum test: does electric power consumption per capita
# differ between continents? The formula is used as-is because the printed
# "data:" label is built from it.
kruskal.test(Electric.power.consumption..kWh.per.capita.~continent, data=cont_energy)
##
## Kruskal-Wallis rank sum test
##
## data: Electric.power.consumption..kWh.per.capita. by continent
## Kruskal-Wallis chi-squared = 353.23, df = 4, p-value < 2.2e-16
# Box plot of electric power consumption per capita for each continent,
# shown as an interactive plotly widget.
pconsume <- ggplot(
  data = cont_energy,
  mapping = aes(x = continent, y = Electric.power.consumption..kWh.per.capita.)
) +
  geom_boxplot() +
  labs(
    x = "Continents",
    y = "Electric power consumption kWh per capita"
  )
ggplotly(pconsume)
# The dunn.test package supplies Dunn's pairwise comparison test.
library(dunn.test)
# Pairwise comparisons of consumption (`elec`) between continents (`cont`)
# with Bonferroni-adjusted p-values. The argument names `elec` and `cont`
# appear as labels in the printed table.
dunn.test(elec, cont, method = "bonferroni")
## Kruskal-Wallis rank sum test
##
## data: elec and cont
## Kruskal-Wallis chi-squared = 353.2299, df = 4, p-value = 0
##
##
## Comparison of elec by cont
## (Bonferroni)
## Col Mean-|
## Row Mean | Africa Americas Asia Europe
## ---------+--------------------------------------------
## Americas | -6.474696
## | 0.0000*
## |
## Asia | -5.544423 0.906730
## | 0.0000* 1.0000
## |
## Europe | -17.28571 -10.36770 -11.29266
## | 0.0000* 0.0000* 0.0000*
## |
## Oceania | -8.703935 -5.861469 -6.255830 -1.648792
## | 0.0000* 0.0000* 0.0000* 0.4960
##
## alpha = 0.05
## Reject Ho if p <= alpha/2
No correlation test applies directly to a continuous variable paired with a nominal variable. Therefore, I instead checked whether electric power consumption differs significantly among the continents.
The Kruskal–Wallis test was appropriate because the data (checked on the Africa sample) did not pass the Shapiro–Wilk normality test. Further pairwise analysis was performed using Dunn’s test with Bonferroni correction.
Differences in electrical power consumption among different continents are statistically significant. (Kruskal Wallis test / Dunn’s test)
While there were significant differences among the continents overall, there were two exceptions:
No significant difference was found between the power consumption of Asia and that of the Americas.
No significant difference was found between the power consumption of Europe and that of Oceania.
# Imports (% of GDP) for European and Asian rows after 1990, with missing
# import values removed.
AsEuImport <- data %>%
  filter(
    Year > 1990,
    continent %in% c("Europe", "Asia"),
    !is.na(Imports.of.goods.and.services....of.GDP.)
  ) %>%
  dplyr::select(continent, Imports.of.goods.and.services....of.GDP.)

# Normality check on the Asia sample.
shapiro.test(AsEuImport$Imports.of.goods.and.services....of.GDP.[AsEuImport$continent == "Asia"])
##
## Shapiro-Wilk normality test
##
## data: AsEuImport$Imports.of.goods.and.services....of.GDP.[AsEuImport$continent == "Asia"]
## W = 0.8549, p-value = 2.31e-08
# The Shapiro-Wilk test above rejected normality for the Asia sample, so the
# two continents are compared with the Wilcoxon rank sum test. The formula is
# used as-is because the printed "data:" label is built from it.
wilcox.test(Imports.of.goods.and.services....of.GDP.~continent, data=AsEuImport)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Imports.of.goods.and.services....of.GDP. by continent
## W = 5707, p-value = 0.7867
## alternative hypothesis: true location shift is not equal to 0
The data failed the Shapiro–Wilk normality test (checked on the Asia sample). Therefore, the Wilcoxon rank sum test was used.
There is no statistically significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990 (p = 0.7867).
# For each year, the country with the highest population density.
# The result stays grouped by Year, as the printout shows.
mdensity <- data %>%
  group_by(Year) %>%
  slice_max(Population.density..people.per.sq..km.of.land.area., n = 1) %>%
  dplyr::select(
    Year,
    Country.Name,
    Population.density..people.per.sq..km.of.land.area.
  )
mdensity
## # A tibble: 10 × 3
## # Groups: Year [10]
## Year Country.Name Population.density..people.per.sq..km.of.land.area.
## <int> <chr> <dbl>
## 1 1962 Monaco 11521
## 2 1967 Monaco 11648.
## 3 1972 Macao SAR, China 12714.
## 4 1977 Monaco 12904.
## 5 1982 Monaco 13814.
## 6 1987 Macao SAR, China 16133.
## 7 1992 Macao SAR, China 18890.
## 8 1997 Macao SAR, China 20602.
## 9 2002 Macao SAR, China 16451.
## 10 2007 Monaco 17523
# The eight densest countries in each year, plotted over time with one
# colour per country.
mdensity2 <- data %>%
  group_by(Year) %>%
  slice_max(Population.density..people.per.sq..km.of.land.area., n = 8) %>%
  dplyr::select(
    Year,
    Country.Name,
    Population.density..people.per.sq..km.of.land.area.
  )
mdenplot <- ggplot(
  data = mdensity2,
  mapping = aes(
    x = Year,
    y = Population.density..people.per.sq..km.of.land.area.,
    color = Country.Name
  )
) +
  geom_point()
ggplotly(mdenplot)
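Monaco and Macao SAR, China have the highest ‘Population density (people per sq. km of land area)’ in every year: Monaco in 1962, 1967, 1977, 1982 and 2007, and Macao SAR, China in 1972 and from 1987 to 2002.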
# Life expectancy per country in 1962; the column is renamed while selecting.
birth1962 <- data %>%
  filter(Year == 1962) %>%
  dplyr::select(
    Country.Name,
    `Life expectancy 1962` = Life.expectancy.at.birth..total..years.
  )

# Life expectancy per country in 2007.
birth2007 <- data %>%
  filter(Year == 2007) %>%
  dplyr::select(
    Country.Name,
    `Life expectancy 2007` = Life.expectancy.at.birth..total..years.
  )

# Pair the two years by country, compute the gain, and list the largest
# gains first.
merged <- merge(birth1962, birth2007, by = "Country.Name")
merged <- mutate(
  merged,
  `Life expectancy increase` = `Life expectancy 2007` - `Life expectancy 1962`
)
merged <- arrange(merged, desc(`Life expectancy increase`))
head(merged)
## Country.Name Life expectancy 1962 Life expectancy 2007
## 1 Maldives 38.48356 75.39971
## 2 Bhutan 33.09415 66.29310
## 3 Timor-Leste 34.73905 65.82420
## 4 Tunisia 43.34168 74.20244
## 5 Oman 44.30051 75.12361
## 6 Nepal 35.95229 66.55193
## Life expectancy increase
## 1 36.91615
## 2 33.19895
## 3 31.08515
## 4 30.86076
## 5 30.82310
## 6 30.59963